library(plyr)
library(tidyverse)
library(stargazer)
library(OasisR)

# load data
# set working directory:
data <- read.csv("Data_jan2019.csv")

# identify numeric variables
numeric_vars <- c("ready", "plus23", 
                  "most.divers_1", "most.divers_2", "most.divers_3", "most.divers_4",
                  "prcv_US_1", "prcv_US_2", "prcv_US_3", "prcv_US_4",
                  "age")

# identify factor variable
factor_vars <- c("DistributionChannel",
                 "letin", "affrmact1", "affrmact2", "affrmact3", "betterplace",
                 "gender", "race", "born", "educ", "faminc")

# convert class type
data <- data %>% mutate_at(vars(numeric_vars), list(as.numeric)) %>%
  mutate_at(vars(factor_vars), list(as.factor)) %>%
  
# remove observations where the distribution channel was "preview" rather than anonymous 
# (aka it was us previewing rather than a respondent)
  filter(DistributionChannel == "anonymous") %>%
# remove the DistributionChannel variable since it is no longer informative
  select(-DistributionChannel) 

# remove observations where the respondent failed the bot screeners 
# (response should be 5 for both "ready" and "plus23") 
data <- data %>% filter(ready == 5 & plus23 == 5) %>%
# remove those variables as well          
  select(-c(ready, plus23))  
  
## recode education
data$educ <- recode(data$educ, `1` = "Less than HS",
                    `2` = "HS/GED",
                    `3` = "Some college",
                    `4` = "2-year degree",
                    `5` = "4-year degree",
                    `6` = "Graduate degree")
data$educ <- factor(data$educ, levels = c("Less than HS", 
                                          "HS/GED",
                                          "2-year degree",
                                          "Some college",
                                          "4-year degree",
                                          "Graduate degree"))

## recode nativity
data$born <- recode(data$born, `1` = "US", 
                    `2` = "US territory", 
                    `3` = "Some other country")
data$born <- factor(data$born, levels = c("US", 
                                          "US territory", 
                                          "Some other country"))

# recode income to midpoints
# use Hout's pareto method for last open-ended category (see Hout 2004)
###########################################################
V <- (log(18 + 10) - log(10))/(log(150000) - log(100000)) #
M <- 150000*0.5*(1 + (V/(V - 1)))                         #
###########################################################
data$faminc_num <- recode(data$faminc, `1` = 5000,
                          `2` = (10000 + 19999)/2,
                          `3` = (20000 + 29999)/2,
                          `4` = (30000 + 39999)/2,
                          `5` = (40000 + 49999)/2,
                          `6` = (50000 + 59999)/2,
                          `7` = (60000 + 69999)/2,
                          `8` = (70000 + 79999)/2,
                          `9` = (80000 + 89999)/2,
                          `10` = (90000 + 99999)/2,
                          `11` = (100000 + 149999)/2,
                          `12` = M,
                          `13` = 0)
# recode rather not say (currently "0") to missing
data[data$faminc_num == 0, "faminc_num"] <- NA

## rescale income
data$faminc_num <- data$faminc_num / 1000

## recode gender
data$gender <- recode(data$gender, `1` = "Male", 
                      `2`= "Female", 
                      `3` = "Neither Male nor Female")
# make "Female" the reference category
data$gender <- fct_relevel(data$gender, "Female", "Male", "Neither Male nor Female")

## recode race
data$race <- recode(data$race, `1` = "White",
                    `2` = "Black/African American",
                    `3` = "Hispanic/Latino",
                    `4` = "Asian/Asian American",
                    `5` = "American Indian/ or Alaska Native",
                    `6` = "Native Hawaiian or Pacific Islander",
                    `7` = "Middle Eastern or North African",
                    `8` = "Something else")

# use the DMulti function in the OasisR package to calculate "dissimilarity" 
# between most_diverse and benchmarks

## first, create four variables for equal_rep that is each equal to 25
data <- data %>% mutate(perfect.rep_1 = rep(25, nrow(data)),
                        perfect.rep_2 = rep(25, nrow(data)),
                        perfect.rep_3 = rep(25, nrow(data)),
                        perfect.rep_4 = rep(25, nrow(data)))

# write function for a loop that will calculate the DI for each observation
Dissim_function <- function(dataset, benchmark){
  # create container to store values
  container_vector <- vector(mode = "numeric", length = nrow(dataset))
  # loop through each row
  for (i in 1:nrow(dataset)){
    # save composition of "most diverse"" nbhd
    unit1 <- select(dataset, grep("most.divers", colnames(data)))[i, ]
    # save composition of benchmark nbhd
    unit2 <- select(dataset, grep(benchmark, colnames(data)))[i, ]
    # rename column names for binding purposes only; bind
    colnames(unit1) <- colnames(unit2)
    matrix <- as.matrix(bind_rows(unit1, unit2))
    # calculate Dissimilarity Index for row i
    DI <- DMulti(matrix)
    # save value in container
    container_vector[i] <- DI
  }
  # function returns a vector of DIs  
  return(container_vector)
}

# create variables and save DI measures for perfect.rep, prcv_US
data <- data %>% mutate(DI_rep = Dissim_function(data, "perfect.rep"),
                        DI_prcvUS = Dissim_function(data, "prcv_US"))

## define DI benchmark: US composition or even representation?
data$DI_UScomp_01 <- ifelse(data$DI_rep < data$DI_prcvUS, 0, 1)

## recode attitudinal variables
data$letin <- mapvalues(data$letin, 
                        from = seq(1:3), 
                        to = c("Present level", "Increased", "Decreased"))

data$letin <- factor(data$letin, levels = c("Decreased", "Present level", "Increased"))

data$affrmact5[data$affrmact2 == 1] <- "Strongly favor"
data$affrmact5[data$affrmact2 == 2] <- "Not strongly favor"
data$affrmact5[data$affrmact1 == 3] <- "Neither favor nor oppose"
data$affrmact5[data$affrmact3 == 1] <- "Strongly oppose"
data$affrmact5[data$affrmact3 == 2] <- "Not strongly oppose"

data$affrmact5 <- factor(data$affrmact5, levels = c("Strongly oppose", 
                                                    "Not strongly oppose", 
                                                    "Neither favor nor oppose",
                                                    "Not strongly favor", 
                                                    "Strongly favor"))

## subset data (whites)
wht <- data[data$race == "White", ]

## how often is "even composition" the most diverse neighborhood?
nrow(data[data$DI_rep == 0,]) / nrow(data)

## how prevalent is each benchmark?
prop.table(table(data$DI_UScomp_01))
prop.table(table(wht$DI_UScomp_01))

## US composition benchmark by immigration attitudes
m <- glm(DI_UScomp_01 ~ letin + age + gender +
              educ + faminc_num + born, family = "binomial", data = wht)

stargazer(m, type = "text", star.cutoffs = c(.05, .01, .001))

## US composition benchmark by affirmative action attitudes
m2 <- glm(DI_UScomp_01 ~ affrmact5 + age + gender +
           educ + faminc_num + born, family = "binomial", data = wht)

stargazer(m2, type = "text", star.cutoffs = c(.05, .01, .001))

